package maui.filters;

import java.util.*;

import weka.core.*;
import weka.filters.*;
import weka.core.Capabilities.Capability;

/**
 * This filter splits the text in selected string attributes into phrases. The
 * resulting string attributes contain these phrases separated by '\n'
 * characters; the words inside a phrase are separated by single spaces.
 * 
 * Phrases are identified according to the following definitions:
 * 
 * A phrase is a sequence of words interrupted only by sequences of whitespace
 * characters, where each sequence of whitespace characters contains at most one
 * '\n'. Any other character, as well as a token that consists of digits only
 * (a number), ends the current phrase.
 * 
 * A word is a sequence of letters or digits that contains at least one letter,
 * with the following exceptions:
 * 
 * a) '.', '@', '_', '&', '/' and '\'' are allowed inside a word if they are
 * immediately preceded and immediately followed by a letter or digit ('.' only
 * as long as internal periods have not been disallowed),
 * 
 * b) a '-' or '/' that directly follows a word does not end the phrase if it
 * is followed (optionally after whitespace) by another word. In that case the
 * two words are joined by a single space (for '-') or by a '/' (for '/'), and
 * the whitespace in between is dropped.
 * 
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
public class MauiPhraseFilter extends Filter implements OptionHandler {

	/** Serialization version identifier. */
	private static final long serialVersionUID = 1L;

	/** Stores which columns (attributes) are selected for tokenization. */
	protected Range m_SelectCols = new Range();

	/** If true, a '.' is never accepted as part of a word. */
	protected boolean m_DisallowInternalPeriods = false;

	/**
	 * Returns a string describing this filter
	 * 
	 * @return a description of the filter suitable for displaying in the
	 *         explorer/experimenter gui
	 */
	public String globalInfo() {
		return "This filter splits the text contained "
				+ "by the selected string attributes into phrases.";
	}

	/**
	 * Returns an enumeration describing the available options (-R, -V, -P).
	 * 
	 * @return an enumeration of all the available options
	 */
	public Enumeration<Option> listOptions() {

		Vector<Option> newVector = new Vector<Option>(3);

		newVector.addElement(new Option(
				"\tSpecify list of attributes to process. First and last are valid\n"
						+ "\tindexes. (default none)", "R", 1,
				"-R <index1,index2-index4,...>"));
		newVector
				.addElement(new Option("\tInvert matching sense", "V", 0, "-V"));
		newVector.addElement(new Option("\tDisallow internal periods", "P", 0,
				"-P"));

		return newVector.elements();
	}

	/**
	 * Returns the Capabilities of this filter: on top of the capabilities of
	 * the superclass, all attribute types, missing values, nominal classes and
	 * data without a class are enabled.
	 * 
	 * @return the capabilities of this object
	 * @see Capabilities
	 */
	public Capabilities getCapabilities() {
		Capabilities result = super.getCapabilities();

		// attributes
		result.enableAllAttributes();
		result.enable(Capability.MISSING_VALUES);

		// class
		result.enable(Capability.NOMINAL_CLASS);
		result.enable(Capability.NO_CLASS);

		return result;
	}

	/**
	 * Parses a given list of options controlling the behaviour of this object.
	 * Valid options are:
	 * <p>
	 * 
	 * -R index1,index2-index4,...<br>
	 * Specify list of attributes to process. First and last are valid indexes.
	 * (default none)
	 * <p>
	 * 
	 * -V<br>
	 * Invert matching sense
	 * <p>
	 * 
	 * -P<br>
	 * Disallow internal periods
	 * <p>
	 * 
	 * If -R is absent (or empty) the current attribute selection is left
	 * untouched. If an input format has already been set, it is re-applied so
	 * that the new settings take effect.
	 * 
	 * @param options
	 *            the list of options as an array of strings
	 * @exception Exception
	 *                if an option is not supported
	 */
	public void setOptions(String[] options) throws Exception {

		String list = Utils.getOption('R', options);
		if (list.length() != 0) {
			setAttributeIndices(list);
		}
		setInvertSelection(Utils.getFlag('V', options));

		setDisallowInternalPeriods(Utils.getFlag('P', options));

		if (getInputFormat() != null) {
			setInputFormat(getInputFormat());
		}
	}

	/**
	 * Gets the current settings of the filter.
	 * 
	 * @return an array of strings suitable for passing to setOptions; always
	 *         of length four, with unused trailing slots set to the empty
	 *         string
	 */
	public String[] getOptions() {

		// At most four entries are needed: "-V", "-P", "-R" and the range.
		String[] options = new String[4];
		int current = 0;

		if (getInvertSelection()) {
			options[current++] = "-V";
		}
		if (getDisallowInternalPeriods()) {
			options[current++] = "-P";
		}
		if (!getAttributeIndices().equals("")) {
			options[current++] = "-R";
			options[current++] = getAttributeIndices();
		}

		while (current < options.length) {
			options[current++] = "";
		}
		return options;
	}

	/**
	 * Sets the format of the input instances. The output format is the same as
	 * the input format; the upper bound of the attribute selection range is
	 * set to the index of the last attribute.
	 * 
	 * @param instanceInfo
	 *            an Instances object containing the input instance structure
	 *            (any instances contained in the object are ignored - only the
	 *            structure is required).
	 * @return true if the outputFormat may be collected immediately
	 */
	public boolean setInputFormat(Instances instanceInfo) throws Exception {

		super.setInputFormat(instanceInfo);
		setOutputFormat(instanceInfo);
		m_SelectCols.setUpper(instanceInfo.numAttributes() - 1);

		return true;
	}

	/**
	 * Input an instance for filtering. The instance is converted immediately
	 * and made available for output.
	 * 
	 * @param instance
	 *            the input instance
	 * @return true if the filtered instance may now be collected with output().
	 * @exception Exception
	 *                if no input format has been defined or if there was a
	 *                problem with the filtering.
	 */
	public boolean input(Instance instance) throws Exception {

		if (getInputFormat() == null) {
			throw new Exception("No input instance format defined");
		}
		if (m_NewBatch) {
			resetQueue();
			m_NewBatch = false;
		}
		convertInstance(instance);
		return true;
	}

	/**
	 * Signify that this batch of input to the filter is finished. All
	 * instance processing of this filter occurs during input(), so this method
	 * only marks the start of a new batch.
	 * 
	 * @return true if there are instances pending output
	 * @exception NullPointerException
	 *                if no input structure has been defined,
	 * @exception Exception
	 *                if there was a problem finishing the batch.
	 */
	public boolean batchFinished() throws Exception {

		if (getInputFormat() == null) {
			throw new NullPointerException("No input instance format defined");
		}
		m_NewBatch = true;
		return (numPendingOutput() != 0);
	}

	/**
	 * Main method for testing this class.
	 * 
	 * @param argv
	 *            should contain arguments to the filter: use -h for help
	 */
	public static void main(String[] argv) {

		try {
			if (Utils.getFlag('b', argv)) {
				Filter.batchFilterFile(new MauiPhraseFilter(), argv);
			} else {
				Filter.filterFile(new MauiPhraseFilter(), argv);
			}
		} catch (Exception ex) {
			System.out.println(ex.getMessage());
		}
	}

	/**
	 * Converts an instance and pushes the result onto the output queue.
	 * Non-string and missing values are copied unchanged. String values of
	 * selected attributes are replaced by their tokenized form (see
	 * {@link #tokenize(String)}); string values of unselected attributes are
	 * copied as they are. In both cases the string is registered with the
	 * corresponding attribute of the output format.
	 * 
	 * @param instance
	 *            the instance to convert
	 */
	private void convertInstance(Instance instance) throws Exception {

		double[] instVals = new double[instance.numAttributes()];

		for (int i = 0; i < instance.numAttributes(); i++) {

			if (!instance.attribute(i).isString() || instance.isMissing(i)) {
				instVals[i] = instance.value(i);
				continue;
			}

			String text = instance.stringValue(i);
			if (m_SelectCols.isInRange(i)) {
				text = tokenize(text);
			}
			// The instance stores the index of the string within the
			// output attribute, not the string itself.
			instVals[i] = (double) getOutputFormat().attribute(i)
					.addStringValue(text);
		}
		Instance inst = new Instance(instance.weight(), instVals);
		inst.setDataset(getOutputFormat());
		push(inst);
	}

	/**
	 * Splits the given text into phrases. Words within a phrase are separated
	 * by a single space (or by '/' if the previous word was directly followed
	 * by a '/'); phrases are separated by '\n'. Tokens that contain no letter
	 * (numbers) and all characters that are neither whitespace nor part of a
	 * word are dropped and end the current phrase.
	 * 
	 * NOTE(review): the apostrophe is handled exactly like '@', '_', '&' and
	 * '/', i.e. it is only kept when surrounded by letters or digits. Earlier
	 * documentation described a more lenient rule (apostrophe allowed whenever
	 * preceded by a letter or digit), but the branch implementing it was
	 * unreachable and has been removed; the runtime behaviour is unchanged.
	 * 
	 * @param text
	 *            the text to tokenize
	 * @return the phrases found in the text, separated by '\n'
	 */
	public String tokenize(String text) {
		StringBuilder result = new StringBuilder();
		int j = 0;
		boolean phraseStart = true;
		boolean seenNewLine = false;
		boolean haveSeenHyphen = false;
		boolean haveSeenSlash = false;
		while (j < text.length()) {
			boolean isWord = false;
			boolean potNumber = false;
			int startj = j;

			// Consume the longest token starting at startj.
			while (j < text.length()) {
				char ch = text.charAt(j);
				if (Character.isLetterOrDigit(ch)) {
					potNumber = true;
					if (Character.isLetter(ch)) {
						isWord = true;
					}
					j++;
				} else if (isInternalWordCharacter(ch) && (j > 0)
						&& (j + 1 < text.length())
						&& Character.isLetterOrDigit(text.charAt(j - 1))
						&& Character.isLetterOrDigit(text.charAt(j + 1))) {
					j++;
				} else {
					break;
				}
			}

			if (isWord) {
				if (!phraseStart) {
					if (haveSeenHyphen) {
						// a hyphen between two words is replaced by a space
						result.append(' ');
					} else if (haveSeenSlash) {
						result.append('/');
					} else {
						result.append(' ');
					}
				}
				result.append(text.substring(startj, j));
				if (j == text.length()) {
					break;
				}
				phraseStart = false;
				seenNewLine = false;
				haveSeenHyphen = false;
				haveSeenSlash = false;

				// Inspect (and skip) the character that terminated the word.
				char next = text.charAt(j);
				if (Character.isWhitespace(next)) {
					if (next == '\n') {
						seenNewLine = true;
					}
				} else if (next == '-') {
					haveSeenHyphen = true;
				} else if (next == '/') {
					haveSeenSlash = true;
				} else {
					phraseStart = true;
					result.append('\n');
				}
				j++;
			} else if (j == text.length()) {
				break;
			} else {
				// The token (possibly empty, possibly a number) is dropped.
				// Decide whether the character that stopped it ends the
				// current phrase, then skip that character.
				char next = text.charAt(j);
				boolean endsPhrase;
				if (next == '\n') {
					// a second newline in a whitespace run, or a number
					endsPhrase = seenNewLine || potNumber;
					seenNewLine = true;
				} else if (Character.isWhitespace(next)) {
					endsPhrase = potNumber;
				} else {
					endsPhrase = true;
				}
				if (endsPhrase && !phraseStart) {
					result.append('\n');
					phraseStart = true;
				}
				j++;
			}
		}

		return result.toString();
	}

	/**
	 * Checks whether the character may occur inside a word, provided it is
	 * surrounded by letters or digits.
	 * 
	 * @param ch
	 *            the character to check
	 * @return true for '@', '_', '&', '/', '\'' and, unless internal periods
	 *         are disallowed, '.'
	 */
	private boolean isInternalWordCharacter(char ch) {
		return (!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@')
				|| (ch == '_') || (ch == '&') || (ch == '/') || (ch == '\'');
	}

	/**
	 * Returns the tip text for this property
	 * 
	 * @return tip text for this property suitable for displaying in the
	 *         explorer/experimenter gui
	 */
	public String invertSelectionTipText() {

		return "If set to false, the specified attributes will be processed;"
				+ " If set to true, specified attributes won't be processed.";
	}

	/**
	 * Get whether the supplied columns are to be processed
	 * 
	 * @return true if the supplied columns won't be processed
	 */
	public boolean getInvertSelection() {

		return m_SelectCols.getInvert();
	}

	/**
	 * Set whether selected columns should be processed. If true the selected
	 * columns won't be processed.
	 * 
	 * @param invert
	 *            the new invert setting
	 */
	public void setInvertSelection(boolean invert) {

		m_SelectCols.setInvert(invert);
	}

	/**
	 * Returns the tip text for this property
	 * 
	 * @return tip text for this property suitable for displaying in the
	 *         explorer/experimenter gui
	 */
	public String disallowInternalPeriodsTipText() {

		return "If set to false, internal periods are allowed.";
	}

	/**
	 * Get whether internal periods are disallowed.
	 * 
	 * @return true if a '.' is never accepted as part of a word
	 */
	public boolean getDisallowInternalPeriods() {

		return m_DisallowInternalPeriods;
	}

	/**
	 * Set whether internal periods are disallowed. If true, a '.' is never
	 * accepted as part of a word.
	 * 
	 * @param disallow
	 *            the new setting
	 */
	public void setDisallowInternalPeriods(boolean disallow) {

		m_DisallowInternalPeriods = disallow;
	}

	/**
	 * Returns the tip text for this property
	 * 
	 * @return tip text for this property suitable for displaying in the
	 *         explorer/experimenter gui
	 */
	public String attributeIndicesTipText() {

		return "Specify range of attributes to act on."
				+ " This is a comma separated list of attribute indices, with"
				+ " \"first\" and \"last\" valid values. Specify an inclusive"
				+ " range with \"-\". E.g: \"first-3,5,6-10,last\".";
	}

	/**
	 * Get the current range selection.
	 * 
	 * @return a string containing a comma separated list of ranges
	 */
	public String getAttributeIndices() {

		return m_SelectCols.getRanges();
	}

	/**
	 * Set which attributes are to be processed
	 * 
	 * @param rangeList
	 *            a string representing the list of attributes. Since the string
	 *            will typically come from a user, attributes are indexed from
	 *            1. <br>
	 *            eg: first-3,5,6-last
	 */
	public void setAttributeIndices(String rangeList) {

		m_SelectCols.setRanges(rangeList);
	}

	/**
	 * Set which attributes are to be processed
	 * 
	 * @param attributes
	 *            an array containing indexes of attributes to select. Since the
	 *            array will typically come from a program, attributes are
	 *            indexed from 0.
	 */
	public void setAttributeIndicesArray(int[] attributes) {

		setAttributeIndices(Range.indicesToRangeList(attributes));
	}
}
